Viva and Yaohan work together on the scripts, and then finish the write-up separately.
Analysis goal:
Methods for data collection:
Data source:
Study area:
Housing unit in Seattle, exclude one housing unit with an extremely high
price over 7 million.
Import all datasets
# read Seattle boundary
seattle <- st_read(here::here("data/raw/Boundary/Seattle_City.geojson")) %>%
st_union() %>%
st_as_sf()
# import house data
kc_hh<- read_csv(here::here("data/raw/kc_house_data.csv"))
# load census tract data
census_api_key("3ec2bee8c227ff3f9df970d0ffbb11ee1384076e", install = TRUE, overwrite = TRUE)
acs_variable_list.2015 <- load_variables(2015, # year
"acs5", # five year ACS estimates
cache = TRUE)
acs_vars <- c("B01003_001E", # total population
"B01001A_001E", # white alone
"B01001_003E", # male under 5
"B01001_004E", # male 5-9
"B01001_005E", # male 10-14
"B01001_020E", # male 65-66
"B01001_021E", # male 67-69
"B01001_022E", # male 70-74
"B01001_023E", # male 75-79
"B01001_024E", # male 80-84
"B01001_025E", # male over 85
"B01001_027E", # female under 5
"B01001_028E", # female 5-9
"B01001_029E", # female 10-14
"B01001_044E", # female 65-66
"B01001_045E", # female 67-69
"B01001_046E", # female 70-74
"B01001_047E", # female 75-79
"B01001_048E", # female 80-84
"B01001_049E", # female over 85
"B15003_001E", # educational attainment over 25
"B15003_022E", # bachelor's degree
"B19013_001E", # median household income
"B23025_004E", # employed labor force
"B23025_003E", # total labor force
"B17020_002E") # income below poverty level
# read amenities data
sub <- st_read(here::here("data/raw/Amenities/Metro_Sub_Stations_in_King_County___sub_stations_point.geojson"))
sch <- st_read(here::here("data/raw/Amenities/Seattle_School_Board_Director_Districts___dirdst_area.geojson"))
park<-st_read(here::here("data/raw/Amenities/Parks_in_King_County___park_area.geojson"))
tree_canopy_2016 <- st_read(here::here("data/raw/Amenities/Tree_Canopy.geojson"))
med <- st_read(here::here("data/raw/Amenities/Medical_Facilities_including_Hospitals___medical_facilities_point.geojson"))
mark<- st_read(here::here("data/raw/Amenities/King_County_Landmarks___landmark_point.geojson"))
# read spatial data
neigh_large <- st_read(here::here("data/raw/Boundary/Neighborhood_Map_Atlas_Districts.geojson"))
neigh_small <- st_read(here::here("data/raw/Boundary/Neighborhood_Map_Atlas_Neighborhoods.geojson"))
# create house location
kc_hh <- kc_hh %>%
st_as_sf(coords = c("long", "lat"), crs = 4326, agr = "constant") %>% # convert to sf object with specified CRS
st_transform(st_crs(seattle)) %>% # transform coordinate reference system to "Washington State Plane North"
distinct() # keep only distinct geometries
hh <- kc_hh %>%
st_intersection(seattle)
# examine house id
hh.id<-length(unique(hh$id))#6691 < 6740
# add an unique key
hh$key <- 1:nrow(hh)
# exclude outliers with extremely high price
hh <- hh %>%
filter(!price > 5000000)
ggplot() +
geom_sf(data = seattle, aes(fill = "Seattle Boundary"), color = NA) + # Use fill for polygon, label "Legend"
geom_sf(data = hh, aes(color = "Housing Units", shape = "Housing Units"), size = 0.5) + # Use color and shape for points, label "Legend"
labs(title = "Housing Unit Locations in Seattle",
color = "Legend", # This now acts as the legend title for both fill and color
fill = NULL, # Hide separate fill legend
shape = NULL) + # Hide separate shape legend, if unnecessary
scale_fill_manual(values = c("Seattle Boundary" = "grey90")) + # Set polygon fill color
scale_color_manual(values = c("Housing Units" = "#2166ac"), name = "Legend") + # Set point color and unified legend title
scale_shape_manual(values = c("Housing Units" = 16), guide = FALSE) + # Set point shape, hide shape guide if it's redundant
theme_void() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"),
legend.position = "right") # Customize legend position
1. Internal Characteristics
- Year used (continuous)
- Renovation (dummy)
- Bedroom (continuous + category)
- Bathroom (continuous + category)
- Floors (continuous + category)
- Living area (continuous)
- Lot area (continuous)
- Waterfront (dummy)
- View (category)
- Condition (category)
- Grade (category)
# add year_used and renovation and ensure categorical data are factor
house <- hh %>%
mutate(year_used = 2015 - yr_built, # used year
reno_dum = as.factor(if_else(yr_renovated>0, 1, 0)),#renovation yes or no
water_dum = as.factor(waterfront),
view_cat = as.factor(view),
condition_cat = as.factor(condition),
grade_cat = as.factor(grade))
# create categorical data by the mean of price
## bed categories
house$bed.factor <- factor(house$bedrooms, levels =sort(unique(house$bedrooms)))
plotMean.bedrooms <- house %>%
st_drop_geometry() %>%
group_by(bed.factor)%>%
summarize(price_m = mean(price))%>%
ggplot(aes(x = bed.factor, y = price_m)) +
geom_col(position = "dodge")+
plotTheme() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) #0-3,4-7,8+
## bathroom category
house$bath.factor <- factor(house$bathrooms, levels =sort(unique(house$bathrooms)))
plotMean.bathrooms<-house %>%
st_drop_geometry() %>%
group_by(bath.factor)%>%
summarize(price_m = mean(price))%>%
ggplot(aes(x = bath.factor, y = price_m)) +
geom_col(position = "dodge")+
plotTheme() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) #0-4, 4+
## floor category
house$floor.factor <- factor(house$floors, levels =sort(unique(house$floors)))
plotMean.floors <- house %>%
st_drop_geometry() %>%
group_by(floor.factor)%>%
summarize(price_m = mean(price))%>%
ggplot(aes(x = floor.factor, y = price_m)) +
geom_col(position = "dodge")+
plotTheme() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) #1-2+3, 2.5+3.5
## reclassify grade
plotMean.grade <- house %>%
st_drop_geometry() %>%
group_by(grade)%>%
summarize(price_m = mean(price))%>%
ggplot(aes(x = grade, y = price_m)) +
geom_col(position = "dodge")+
plotTheme() + theme(axis.text.x = element_text(angle = 45, hjust = 1))#4-9, 10-14
# add categories
house <- house %>%
mutate(
bed_cat = factor(case_when(
bedrooms <=3 ~ "few",
bedrooms >3 & bedrooms <= 7 ~ "medium",
bedrooms >=8 ~ "many"
)),
bath_dum = factor(case_when(
bathrooms <= 4 ~ "few",
bathrooms > 4 ~ "many"
)),
floor_cat = factor(case_when(
floors <= 2 | floors == 3 ~ "regular",
floors %in% c(2.5, 3.5) ~ "irregular"
)),
grade_dum = factor(ifelse(grade <= 9, "low","high"),
levels = c("low","high")))
# select variables
house <- house %>%
select("key", "price","year_used","reno_dum",
"bedrooms", "bed_cat", "bathrooms", "bath_dum",
"sqft_living", "sqft_lot", "floor_cat",
"water_dum", "view_cat", "condition_cat", "grade_dum")
2. Socio-economic Characteristics
- Population density (continuous)
- White population share (continuous)
- Age structure (continuous)
- Education level (continuous + dummy)
- Median household income (continuous + dummy)
- Employment rate (continuous + dummy)
- Poverty rate (continuous + dummy)
acsTractsSeattle.2015 <- get_acs(geography = "tract",
year = 2015,
variables = acs_vars,
geometry = TRUE,
state = "Washington",
county = "King",
output = "wide") %>%
st_transform(st_crs(seattle)) %>%
select(GEOID, NAME, all_of(acs_vars)) %>%
rename(total_pop = B01003_001E,
white_pop = B01001A_001E,
edu_bach = B15003_022E,
edu_attain = B15003_001E,
median_hh_income = B19013_001E,
total_labor = B23025_003E,
employ_labor = B23025_004E,
poverty = B17020_002E) %>%
mutate(area = st_area(.)) %>%
mutate(area = set_units(x = area, value = "acres"))%>%
mutate(pop_den = ifelse(as.numeric(area) > 0, total_pop / area, 0),
white_share = round(ifelse(total_pop > 0, white_pop / total_pop, 0) * 100, digits = 2),
pop_under14 = B01001_003E + B01001_004E + B01001_005E + B01001_027E +
B01001_028E + B01001_029E,
pop_over65 = B01001_020E + B01001_021E + B01001_022E + B01001_023E +
B01001_024E + B01001_025E + B01001_044E + B01001_045E + B01001_046E +
B01001_047E + B01001_048E + B01001_049E,
total_dep = round((pop_under14 + pop_over65) / (total_pop -
(pop_under14 + pop_over65)) * 100, digits = 2),
elder_dep = round(pop_over65 / (total_pop - (pop_under14 + pop_over65)) * 100, digits = 2),
bach_share = round(ifelse(edu_attain > 0, edu_bach/edu_attain, 0) * 100, digits = 2),
employ_rate = round(ifelse(total_labor > 0, employ_labor / total_labor, 0) * 100, digits = 2),
pover_rate = round(ifelse(total_pop > 0, poverty / total_pop, 0) * 100, digits = 2))
acsTractsSeattle.2015 <- acsTractsSeattle.2015 %>%
mutate(bach_dum = factor(ifelse(bach_share > mean(acsTractsSeattle.2015$bach_share,
na.rm = TRUE), "above", "below"),
levels = c("below", "above")),
median_hh_dum = factor(ifelse(median_hh_income > mean(acsTractsSeattle.2015$median_hh_income,
na.rm = TRUE), "above", "below"),
levels = c("below", "above")),
employ_dum = factor(ifelse(employ_rate > mean(acsTractsSeattle.2015$employ_rate,
na.rm = TRUE), "above", "below"),
levels = c("below", "above")),
pover_dum = factor(ifelse(pover_rate > mean(acsTractsSeattle.2015$pover_rate,
na.rm = TRUE), "above", "below"),
levels = c("below", "above")))%>%
select(GEOID, NAME, pop_den, white_share, total_dep, elder_dep, bach_share, bach_dum, median_hh_income,
median_hh_dum, employ_rate, employ_dum, pover_rate, pover_dum)
# assign census tract characteristics to house
house <- house %>%
st_join(., acsTractsSeattle.2015)
# remove NA, one house may outside the boundary of census tracts
house <- house %>%
filter(!is.na(pop_den))
# frequency of dummy variables
table(house$bach_dum)
table(house$median_hh_dum)
table(house$employ_dum)
table(house$pover_dum)
3. Amenities Services
- Subway Station (distance + category)
- School District (category)
- Park (area + count)
- Medical Facilities (distance + category)
- Commercial center (distance + count)
- Crime rate (distance + )
# subway station
sub <- sub %>%
st_transform(st_crs(house))
## the distance to the nearest station
house <- house %>%
mutate(sub_dis = nn_function(st_coordinates(house),
st_coordinates(sub), k = 1))
## categories based on distance
house <- house %>%
mutate(sub_cat = factor(case_when(
sub_dis <= 2640 ~ "within0.5mile",
sub_dis > 2640 & sub_dis <= 5280 ~ "0.5-1mile",
sub_dis > 5280 ~ "1+mile"
),levels = c("within0.5mile","0.5-1mile","1+mile")))
# school district
sch <- sch %>%
st_transform(st_crs(house))
## add the school district variable
house <- house %>%
st_join(sch%>%select(DIRDST), join = st_within)%>%
rename(sch_cat = "DIRDST")%>%
mutate(sch_cat = factor(sch_cat, levels = c("DD1","DD2","DD3","DD4","DD5","DD6","DD7")))
# park
park <- park %>%
st_transform(st_crs(house))%>%
st_intersection(seattle)%>%
mutate(park_area = st_area(.))%>%
mutate(park_area = set_units(x = park_area, value = "acres"))
## area and count of parks within 500 feet
house_parks <- st_join(st_buffer(house, dist = 500), park, join = st_intersects)%>%
group_by(key) %>%
summarise(park_c = n_distinct(SITENAME, na.rm = TRUE),
sum_park_area = sum(park_area, na.rm = TRUE))
house_parks$all_park_area <- as.numeric(house_parks$sum_park_area)
house_parks$park_cat <- as.factor(house_parks$park_c)
house <- house %>%
left_join(house_parks%>%
st_drop_geometry()%>%
select(key,park_cat, all_park_area), by = "key")%>%
rename(parks_area = all_park_area)
# tree canopy
tree_canopy_2016 <- tree_canopy_2016 %>%
st_transform(st_crs(house)) %>%
mutate(tree_canopy = round(TreeCanopy_2016_Percent, digits = 2)) %>%
select(tree_canopy)
house <- house %>%
st_join(., tree_canopy_2016)
# medical facilities
med <- med %>%
st_transform(st_crs(house))%>%
st_intersection(seattle)
## calculate the distance to the nearest medical facilities
house <- house %>%
mutate(med_dis1 = nn_function(st_coordinates(house),
st_coordinates(med), k = 1),
med_dis2 = nn_function(st_coordinates(house),
st_coordinates(med), k = 2),
med_dis3 = nn_function(st_coordinates(house),
st_coordinates(med), k = 3))
## categories based on distance
house <- house %>%
mutate(med_cat = case_when(
med_dis1 <= 2640 ~ "within0.5mile",
med_dis1 > 2640 & med_dis1 <= 5280 ~ "0.5-1mile",
med_dis1 > 5280 ~ "1+mile"))
# commercial
mark <- mark %>%
st_transform(st_crs(house))%>%
st_intersection(seattle)
## select shopping center from landmrak dataset
mark_shop <- mark%>%
filter(CODE == 690)
## calculate the distance to the nearest 1/2/3 shopping center(s)
house <- house %>%
mutate(shop_dis1 = nn_function(st_coordinates(house),
st_coordinates(mark_shop), k = 1),
shop_dis2 = nn_function(st_coordinates(house),
st_coordinates(mark_shop), k = 2),
shop_dis3 = nn_function(st_coordinates(house),
st_coordinates(mark_shop), k = 3))
## categories based on distance
house <- house %>%
mutate(shop_cat = factor(case_when(
shop_dis1 <= 2640 ~ "within0.5mile",
shop_dis1 > 2640 & shop_dis1 <= 5280 ~ "0.5-1mile",
shop_dis1 > 5280 ~ "1+mile"
),levels = c("within0.5mile","0.5-1mile","1+mile")))
# crime
# crime <- read.csv(here::here("data/raw/Amenities/SPD_Crime_Data__2008-Present_20240328.csv"))
# get the target crime
# crime_clean <- crime %>%
# mutate(year = str_sub(Report.Number, 1, 4)) %>%
# filter(year %in% c("2013", "2014", "2015") )%>% #choose those before and in 2015
# filter(!Offense %in% c(
# "Bad Checks",
# "Bribery",
# "Embezzlement",
# "Extortion/Blackmail",
# "Credit Card/Automated Teller Machine Fraud",
# "False Pretenses/Swindle/Confidence Game",
# "Identity Theft",
# "Impersonation",
# "Welfare Fraud",
# "Wire Fraud",
# "Curfew/Loitering/Vagrancy Violations",
# "Driving Under the Influence",
# "Drug Equipment Violations",
# "Drug/Narcotic Violations",
# "Betting/Wagering",
# "Gambling Equipment Violation",
# "Operating/Promoting/Assisting Gambling",
# "Liquor Law Violations",
# "Pornography/Obscene Material",
# "Assisting or Promoting Prostitution",
# "Prostitution",
# "Weapon Law Violations"
# ))%>% #exclude those with little impact on housing price e.g.Financial Crimes, Public Order Offenses
# filter(Longitude != 0 & Latitude != 0)# select the valid ones
#
# write.csv(crime_clean , here::here("data/processed/crime_clean.csv"), row.names = FALSE)
## read the cleaned data set
crime <- read.csv(here::here("data/processed/crime_clean.csv")) %>%
st_as_sf(coords = c("Longitude", "Latitude"), crs = 4326)%>%
st_transform(st_crs(house))
## count of crime within 1/8 mi
house$crime_c <- house %>%
st_buffer(660) %>%
aggregate(mutate(crime, counter = 1)%>%select(counter),., sum) %>%
pull(counter)
## calculate the distance to the nearest 1/2/3/4/5 crime locations
house <- house %>%
mutate(crime_dis1 = nn_function(st_coordinates(house),
st_coordinates(crime), k = 1),
crime_dis2 = nn_function(st_coordinates(house),
st_coordinates(crime), k = 2),
crime_dis3 = nn_function(st_coordinates(house),
st_coordinates(crime), k = 3),
crime_dis4 = nn_function(st_coordinates(house),
st_coordinates(crime), k = 4),
crime_dis5 = nn_function(st_coordinates(house),
st_coordinates(crime), k = 5))
4. Spatial Structure
- Large District
- Small Neighborhood
- Census Tract
# large District
neigh_large <- neigh_large%>%
st_transform(st_crs(house)) %>%
rename(L_NAME = L_HOOD) %>%
select(L_NAME)
# small neighborhoods
neigh_small<- neigh_small%>%
st_transform(st_crs(house)) %>%
rename(S_NAME = S_HOOD) %>%
select(S_NAME)
# census tracts
neigh_tract <- acsTractsSeattle.2015 %>%
select(NAME) %>%
rename(T_NAME = NAME) %>%
st_intersection(seattle)
# exclude outlier
house <- house%>%
filter(bedrooms<30 & crime_dis1 < 750 & bathrooms < 6)
# plot the final continuous variable
st_drop_geometry(house) %>%
select(-key)%>%
select_if(is.numeric) %>%
gather(Variable, Value, -price) %>%
ggplot(aes(Value, price)) +
geom_point(size = .5) + geom_smooth(method = "lm", se=F, colour = "#FA7800") +
facet_wrap(~Variable, ncol = 3, scales = "free") +
labs(title = "Price as a function of continuous variables") +
theme(text = element_text(size = 12), # Default text size for all text
plot.title = element_text(size = 12, face = "bold"), # Title
axis.text = element_text(size = 8), # Axis text
axis.title = element_text(size = 8), # Axis titles
strip.text = element_text(size = 8)) # Facet label text
| variables | category | description | unit | max | min | mean | st.dev. | n |
|---|---|---|---|---|---|---|---|---|
| price | dependent | Price: price of each unit | $ | 3800000 | 90000 | 589144 | 340388 | 6734 |
| year_used | internal | Year Used: years from built to 2015 | year | 115 | 0 | 62 | 35 | 6734 |
| bedrooms | internal | No.bedrooms: the number of bedrooms in each unit | # | 11 | 0 | 3 | 1 | 6734 |
| bathrooms | internal | No.bathrooms: the number of bathrooms in each unit | # | 5 | 0 | 2 | 1 | 6734 |
| sqft_living | internal | Living Area: the area of living of each unit | sqft | 7880 | 370 | 1799 | 799 | 6734 |
| sqft_lot | internal | Lot Area: the area of the lot of each unit | sqft | 91681 | 520 | 5105 | 3583 | 6734 |
| pop_den | socio-economic | Population Density: the number of population per acre in the census tract | person / acre | 76 | 1 | 12 | 7 | 6734 |
| white_share | socio-economic | White Population Share:the ratio of white people to total population in the census tract | % | 94 | 8 | 72 | 19 | 6734 |
| total_dep | socio-economic | Total Dependency Ratio: the ratio of the number of children (0-14 years old) and older persons (65 years or over) to the working-age population (15-64 years old) in the census tract | % | 73 | 3 | 39 | 12 | 6734 |
| elder_dep | socio-economic | Elderly Dependency Ratio: the ratio of older persons (65 years or over) to the working-age population (15-64 years old) in the census tract | % | 49 | 2 | 17 | 7 | 6734 |
| bach_share | socio-economic | Bachelor’s Degree Rate: the percentage of with a bachelor’s degree among adults age 25 and older in the census tract | % | 53 | 10 | 35 | 8 | 6734 |
| median_hh_income | socio-economic | Median Household Income: median househhold income in the census tract | $ | 157292 | 12269 | 82292 | 26471 | 6734 |
| employ_rate | socio-economic | Employment Rate: the ratio of the employed to the working age population in the census tract | % | 99 | 81 | 95 | 3 | 6734 |
| pover_rate | socio-economic | Poverty Rate: the ratio of the number of people (in a given age group) whose income falls below the poverty line to total population in the census tract | % | 43 | 3 | 11 | 8 | 6734 |
| sub_dis | amenities | Nearest Subway Distance: the distance to the nearest subway station | feet | 27441 | 27 | 9497 | 7439 | 6734 |
| parks_area | amenities | Parks’ Area 500ft: the total area of parks located within a 500-foot radius of each unit | acre | 553 | 0 | 13 | 45 | 6734 |
| tree_canopy | amenities | Tree Canopy Ratio: the ratio of the area of tree canopy to the total area in the measuring space | % | 89 | 5 | 29 | 9 | 6734 |
| med_dis1 | amenities | Nearest Medical Distance: the distance to the nearest medical facility | feet | 13892 | 9 | 4385 | 2558 | 6734 |
| med_dis2 | amenities | Average Distance to 2 Medicals: the average distance to the nearest 2 medical facilities | feet | 17742 | 134 | 5616 | 2923 | 6734 |
| med_dis3 | amenities | Average Distance to 3 Medicals:the average distance to the nearest 3 medical facilities | feet | 20699 | 355 | 6726 | 3691 | 6734 |
| shop_dis1 | amenities | Nearest shopping Distance:the distance to the nearest shopping center | feet | 31507 | 99 | 9019 | 5459 | 6734 |
| shop_dis2 | amenities | Average Distance to 2 Shoppings:the average distance to the nearest 2 shopping center | feet | 34370 | 1505 | 10931 | 5149 | 6734 |
| shop_dis3 | amenities | Average Distance to 3 Shoppings:the average distance to the nearest 3 shopping center | feet | 35466 | 1781 | 12166 | 5218 | 6734 |
| crime_c | amenities | No.crime: the number of crimes within a 1/8-mile radius around each unit |
|
1044 | 2 | 83 | 72 | 6734 |
| crime_dis1 | amenities | Nearest Crime Distance: the distance to the nearest crime | feet | 569 | 4 | 133 | 67 | 6734 |
| crime_dis2 | amenities | Average Distance to 2 Crimes:the average distance to the nearest 2 crime | feet | 583 | 4 | 144 | 69 | 6734 |
| crime_dis3 | amenities | Average Distance to 3 Crimes:the average distance to the nearest 3 crime | feet | 598 | 4 | 153 | 72 | 6734 |
| crime_dis4 | amenities | Average Distance to 4 Crimes:the average distance to the nearest 4 crime | feet | 617 | 4 | 162 | 75 | 6734 |
| crime_dis5 | amenities | Average Distance to 5 Crimes:the average distance to the nearest 5 crime | feet | 688 | 4 | 171 | 78 | 6734 |
# exclude useless variable
house <- house %>%
select(-med_cat)
#plot all the mean of price on each final categorical variable
house %>%
st_drop_geometry()%>%
select(-GEOID,-NAME)%>%
select(price,reno_dum, bed_cat, bath_dum,
floor_cat,water_dum, view_cat, condition_cat, grade_dum, bach_dum, median_hh_dum, employ_dum, pover_dum, sub_cat, sch_cat, park_cat, shop_cat)%>%
gather(Variable, Value, -price) %>%
ggplot(aes(Value, price)) +
geom_bar(position = "dodge", stat = "summary", fun.y = "mean") +
facet_wrap(~Variable, ncol = 3, scales = "free") +
labs(title = "Price as a function of categorical variables", y = "Mean_Price") +
theme(text = element_text(size = 12), # Default text size for all text
plot.title = element_text(size = 12, face = "bold"), # Title
axis.text = element_text(size = 8), # Axis text
axis.title = element_text(size = 8), # Axis titles
strip.text = element_text(size = 8)) # Facet label text
### reno_dum
house %>%
st_drop_geometry() %>%
group_by(reno_dum) %>%
summarise(count = n()) %>%
mutate(percent = round(count/sum(count) * 100, digits = 2),
description = c("haven't been renovated", "have been reivated"))%>%
kable(caption = "Renovation Status", table.attr = 'id="myTable"') %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = T) %>%
column_spec(1:4, extra_css = "text-align: left;")
| reno_dum | count | percent | description |
|---|---|---|---|
| 0 | 6289 | 93.39 | haven’t been renovated |
| 1 | 445 | 6.61 | have been reivated |
### bed_cat, 0-3,4-7,8+
house %>%
st_drop_geometry() %>%
group_by(bed_cat) %>%
summarise(count = n()) %>%
mutate(percent = round(count/sum(count) * 100, digits = 2))%>%
mutate(bed_cat = factor(bed_cat, levels = c("few", "medium", "many"))) %>%
arrange(bed_cat)%>%
mutate(description = c("the unit has 0-3 bedrooms",
"the unit has 4-7 bedrooms",
"the unit has more than 8 bedrooms"))%>%
kable(caption = "Category of Bedroom Count") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = T) %>%
column_spec(1:4, extra_css = "text-align: left;")
| bed_cat | count | percent | description |
|---|---|---|---|
| few | 4695 | 69.72 | the unit has 0-3 bedrooms |
| medium | 2025 | 30.07 | the unit has 4-7 bedrooms |
| many | 14 | 0.21 | the unit has more than 8 bedrooms |
### bath_dum, 0-4, 4+
house %>%
st_drop_geometry() %>%
group_by(bath_dum) %>%
summarise(count = n()) %>%
mutate(percent = round(count/sum(count) * 100, digits = 2))%>%
mutate(bath_dum = factor(bath_dum, levels = levels(house$bath_dum))) %>%
arrange(bath_dum)%>%
mutate(description = c("the unit has 0-4 bathrooms",
"the unit has more than 4 bedrooms"))%>%
kable(caption = "Category of Bathroom Count") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = T) %>%
column_spec(1:4, extra_css = "text-align: left;")
| bath_dum | count | percent | description |
|---|---|---|---|
| few | 6680 | 99.2 | the unit has 0-4 bathrooms |
| many | 54 | 0.8 | the unit has more than 4 bedrooms |
### floor_cat, 1-2+3, 2.5+3.5
house %>%
st_drop_geometry() %>%
group_by(floor_cat) %>%
summarise(count = n()) %>%
mutate(percent = round(count/sum(count) * 100, digits = 2))%>%
mutate(floor_cat = factor(floor_cat, levels = levels(house$floor_cat))) %>%
arrange(floor_cat)%>%
mutate(description = c("the unit has 2.5/3.5 floors",
"the unit has 1/1.5/2/3 floors"))%>%
kable(caption = "Category by Floors") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = T) %>%
column_spec(1:4, extra_css = "text-align: left;")
| floor_cat | count | percent | description |
|---|---|---|---|
| irregular | 104 | 1.54 | the unit has 2.5/3.5 floors |
| regular | 6630 | 98.46 | the unit has 1/1.5/2/3 floors |
### water_dum
house %>%
st_drop_geometry() %>%
group_by(water_dum) %>%
summarise(count = n()) %>%
mutate(percent = round(count/sum(count) * 100, digits = 2),
description = c("the unit isn't located at waterfront area",
"the unit is located at waterfront area"))%>%
kable(caption = "Waterfront Factor") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = T) %>%
column_spec(1:4, extra_css = "text-align: left;")
| water_dum | count | percent | description |
|---|---|---|---|
| 0 | 6705 | 99.57 | the unit isn’t located at waterfront area |
| 1 | 29 | 0.43 | the unit is located at waterfront area |
### view_cat, 0,1,2,3,4
house %>%
st_drop_geometry() %>%
group_by(view_cat) %>%
summarise(count = n()) %>%
mutate(percent = round(count/sum(count) * 100, digits = 2))%>%
mutate(view_cat = factor(view_cat, levels = levels(house$view_cat))) %>%
arrange(view_cat)%>%
mutate(description = c("the unit has a view scoring 0/4",
"the unit has a view scoring 1/4",
"the unit has a view scoring 2/4",
"the unit has a view scoring 3/4",
"the unit has a view scoring 4/4" ))%>%
kable(caption = "View Quality") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = T) %>%
column_spec(1:4, extra_css = "text-align: left;")
| view_cat | count | percent | description |
|---|---|---|---|
| 0 | 5880 | 87.32 | the unit has a view scoring 0/4 |
| 1 | 143 | 2.12 | the unit has a view scoring 1/4 |
| 2 | 407 | 6.04 | the unit has a view scoring 2/4 |
| 3 | 202 | 3.00 | the unit has a view scoring 3/4 |
| 4 | 102 | 1.51 | the unit has a view scoring 4/4 |
### condition_cat
house %>%
st_drop_geometry() %>%
group_by(condition_cat) %>%
summarise(count = n()) %>%
mutate(percent = round(count/sum(count) * 100, digits = 2))%>%
mutate(condition_cat = factor(condition_cat, levels = levels(house$condition_cat))) %>%
arrange(condition_cat)%>%
mutate(description = c("the unit's condition scores 1/5",
"the unit's condition scores 2/5",
"the unit's condition scores 3/5",
"the unit's condition scores 4/5",
"the unit's condition scores 5/5"))%>%
kable(caption = "Condition Level") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = T) %>%
column_spec(1:4, extra_css = "text-align: left;")
| condition_cat | count | percent | description |
|---|---|---|---|
| 1 | 12 | 0.18 | the unit’s condition scores 1/5 |
| 2 | 57 | 0.85 | the unit’s condition scores 2/5 |
| 3 | 4324 | 64.21 | the unit’s condition scores 3/5 |
| 4 | 1566 | 23.26 | the unit’s condition scores 4/5 |
| 5 | 775 | 11.51 | the unit’s condition scores 5/5 |
### grade_dum
house %>%
st_drop_geometry() %>%
group_by(grade_dum) %>%
summarise(count = n()) %>%
mutate(percent = round(count/sum(count) * 100, digits = 2))%>%
mutate(grade_dum = factor(grade_dum, levels = levels(house$grade_dum))) %>%
arrange(grade_dum)%>%
mutate(description = c("the unit's grade is 4-9",
"the unit's grade is 10-13"))%>%
kable(caption = "Grade Level") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = T) %>%
column_spec(1:4, extra_css = "text-align: left;")
| grade_dum | count | percent | description |
|---|---|---|---|
| low | 6476 | 96.17 | the unit’s grade is 4-9 |
| high | 258 | 3.83 | the unit’s grade is 10-13 |
### bach_dum
house %>%
st_drop_geometry() %>%
group_by(bach_dum) %>%
summarise(count = n()) %>%
mutate(percent = round(count/sum(count) * 100, digits = 2))%>%
mutate(bach_dum = factor(bach_dum, levels = levels(house$bach_dum))) %>%
arrange(bach_dum)%>%
mutate(description = c("the unit is in a census tract with a bachelor's degree rate below the Seattle average",
"the unit is in a census tract with a bachelor's degree rate above the Seattle average"))%>%
kable(caption = "Bachelor's Degree Rate Level") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = T) %>%
column_spec(1:4, extra_css = "text-align: left;")
| bach_dum | count | percent | description |
|---|---|---|---|
| below | 1462 | 21.71 | the unit is in a census tract with a bachelor’s degree rate below the Seattle average |
| above | 5272 | 78.29 | the unit is in a census tract with a bachelor’s degree rate above the Seattle average |
### median_hh_dum
house %>%
st_drop_geometry() %>%
group_by(median_hh_dum) %>%
summarise(count = n()) %>%
mutate(percent = round(count/sum(count) * 100, digits = 2))%>%
mutate(median_hh_dum = factor(median_hh_dum, levels = levels(house$median_hh_dum))) %>%
arrange(median_hh_dum)%>%
mutate(description = c("the unit is in a census tract with a median household income below the Seattle average",
"the unit is in a census tract with a median household income above the Seattle average"))%>%
kable(caption = "Median Household Income Level") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = T) %>%
column_spec(1:4, extra_css = "text-align: left;")
| median_hh_dum | count | percent | description |
|---|---|---|---|
| below | 3440 | 51.08 | the unit is in a census tract with a median household income below the Seattle average |
| above | 3294 | 48.92 | the unit is in a census tract with a median household income above the Seattle average |
### employ_dum
house %>%
st_drop_geometry() %>%
group_by(employ_dum) %>%
summarise(count = n()) %>%
mutate(percent = round(count/sum(count) * 100, digits = 2))%>%
mutate(employ_dum = factor(employ_dum, levels = levels(house$employ_dum))) %>%
arrange(employ_dum)%>%
mutate(description = c("the unit is in a census tract with a employment rate below the Seattle average",
"the unit is in a census tract with a employment rate above the Seattle average"))%>%
kable(caption = "Employment Rate Level") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = T) %>%
column_spec(1:4, extra_css = "text-align: left;")
| employ_dum | count | percent | description |
|---|---|---|---|
| below | 1517 | 22.53 | the unit is in a census tract with a employment rate below the Seattle average |
| above | 5217 | 77.47 | the unit is in a census tract with a employment rate above the Seattle average |
### pover_dum
house %>%
st_drop_geometry() %>%
group_by(pover_dum) %>%
summarise(count = n()) %>%
mutate(percent = round(count/sum(count) * 100, digits = 2))%>%
mutate(pover_dum = factor(pover_dum, levels = levels(house$pover_dum))) %>%
arrange(pover_dum)%>%
mutate(description = c("the unit is in a census tract with a poverty rate below the Seattle average",
"the unit is in a census tract with a poverty rate above the Seattle average"))%>%
kable(caption = "Poverty Rate Level") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = T) %>%
column_spec(1:4, extra_css = "text-align: left;")
| pover_dum | count | percent | description |
|---|---|---|---|
| below | 4366 | 64.84 | the unit is in a census tract with a poverty rate below the Seattle average |
| above | 2368 | 35.16 | the unit is in a census tract with a poverty rate above the Seattle average |
### sub_cat
house %>%
st_drop_geometry() %>%
group_by(sub_cat) %>%
summarise(count = n()) %>%
mutate(percent = round(count/sum(count) * 100, digits = 2))%>%
mutate(sub_cat = factor(sub_cat, levels = levels(house$sub_cat))) %>%
arrange(sub_cat)%>%
mutate(description = c("the unit is within a 0.5-mile radius of a subway station",
"the unit is within a 0.5-mile to 1-mile radius of a subway station",
"the unit is beyond a 1-mile radius of a subway station"))%>%
kable(caption = "Category by Subway Distance") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = T) %>%
column_spec(1:4, extra_css = "text-align: left;")
| sub_cat | count | percent | description |
|---|---|---|---|
| within0.5mile | 1505 | 22.35 | the unit is within a 0.5-mile radius of a subway station |
| 0.5-1mile | 1388 | 20.61 | the unit is within a 0.5-mile to 1-mile radius of a subway station |
| 1+mile | 3841 | 57.04 | the unit is beyond a 1-mile radius of a subway station |
### sch_cat
house %>%
st_drop_geometry() %>%
group_by(sch_cat) %>%
summarise(count = n()) %>%
mutate(percent = round(count/sum(count) * 100, digits = 2))%>%
mutate(sch_cat = factor(sch_cat, levels = c("DD1","DD2","DD3","DD4","DD5","DD6","DD7"))) %>%
arrange(sch_cat)%>%
mutate(description = c("the unit is in school district one",
"the unit is in school district two",
"the unit is in school district three",
"the unit is in school district four",
"the unit is in school district five",
"the unit is in school district six",
"the unit is in school district seven"))%>%
kable(caption = "School District") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = T) %>%
column_spec(1:4, extra_css = "text-align: left;")
| sch_cat | count | percent | description |
|---|---|---|---|
| DD1 | 1172 | 17.40 | the unit is in school district one |
| DD2 | 1310 | 19.45 | the unit is in school district two |
| DD3 | 758 | 11.26 | the unit is in school district three |
| DD4 | 382 | 5.67 | the unit is in school district four |
| DD5 | 808 | 12.00 | the unit is in school district five |
| DD6 | 1369 | 20.33 | the unit is in school district six |
| DD7 | 935 | 13.88 | the unit is in school district seven |
### park_cat
house %>%
st_drop_geometry() %>%
group_by(park_cat) %>%
summarise(count = n()) %>%
mutate(percent = round(count/sum(count) * 100, digits = 2))%>%
mutate(park_cat = factor(park_cat, levels = levels(house$park_cat))) %>%
arrange(park_cat)%>%
mutate(description = c("the unit is beyond a 500-feet radius of any park",
"the unit is within a 500-feet radius of one park",
"the unit is within a 500-feet radius of two parks",
"the unit is within a 500-feet radius of three park",
"the unit is within a 500-feet radius of four park"))%>%
kable(caption = "Number of nearby Parks") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = T) %>%
column_spec(1:4, extra_css = "text-align: left;")
| park_cat | count | percent | description |
|---|---|---|---|
| 0 | 4679 | 69.48 | the unit is beyond a 500-feet radius of any park |
| 1 | 1651 | 24.52 | the unit is within a 500-feet radius of one park |
| 2 | 327 | 4.86 | the unit is within a 500-feet radius of two parks |
| 3 | 67 | 0.99 | the unit is within a 500-feet radius of three park |
| 4 | 10 | 0.15 | the unit is within a 500-feet radius of four park |
### shop_cat
house %>%
st_drop_geometry() %>%
group_by(shop_cat) %>%
summarise(count = n()) %>%
mutate(percent = round(count/sum(count) * 100, digits = 2))%>%
mutate(shop_cat = factor(shop_cat, levels = levels(house$shop_cat))) %>%
arrange(shop_cat)%>%
mutate(description = c("the unit is within a 0.5-mile radius of a shopping center",
"the unit is within a 0.5-mile to 1-mile radius of a shopping center",
"the unit is beyond a 1-mile radius of a shopping center"))%>%
kable(caption = "Category by Shopping Center Distance") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = T) %>%
column_spec(1:4, extra_css = "text-align: left;")
| shop_cat | count | percent | description |
|---|---|---|---|
| within0.5mile | 464 | 6.89 | the unit is within a 0.5-mile radius of a shopping center |
| 0.5-1mile | 1245 | 18.49 | the unit is within a 0.5-mile to 1-mile radius of a shopping center |
| 1+mile | 5025 | 74.62 | the unit is beyond a 1-mile radius of a shopping center |
# Select only numeric variables and remove rows with missing values
house_numeric <- house %>%
st_drop_geometry() %>% # Remove geometry column if present
select(-key) %>% # delete unrelative column
select_if(is.numeric) %>% # Select only numeric variables
na.omit()%>% # Remove rows with missing values
setNames(c("Price","Year Used", "No.bedrooms","No.bathrooms",
"Living Area","Lot Area","Population Density","White Population Share",
"Total Dependency Ratio","Elderly Dependency Ratio","Bachelor's Degree Rate","Median Household Income",
"Employment Rate","Poverty Rate","Nearest Subway Distance","Parks' Area 500ft","Tree Canopy Ratio",
"Nearest Medical Distance","Average Distance to 2 Medicals","Average Distance to 3 Medicals",
"Nearest Shopping Distance","Average Distance to 2 Shoppings","Average Distance to 3 Shoppings",
"No.crime","Nearest Crime Distance","Average Distance to 2 Crimes","Average Distance to 3 Crimes","Average Distance to 4 Crimes","Average Distance to 5 Crimes"))
# Calculate correlation matrix
correlation_matrix <- cor(house_numeric)
#plot the correlation plot using the corrr library
house_numeric %>%
corrr::correlate() %>%
autoplot() +
geom_text(aes(label = round(r,digits=2)),size = 1)
Living Area Square Feet
ggplot(house) +
geom_point(aes(x = sqft_living, y = price), color = "black", pch = 16, size = 1.6) +
labs(title = "Seattle House Price vs. Living Area Square Feet",
x = "Living Sqft",
y = "House Price") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))
Median Household Income (Census Tract)
ggplot(house) +
geom_point(aes(x = median_hh_income, y = price), color = "black", pch = 16, size = 1.6) +
labs(title = "Seattle House Price vs. Median Household Income (Census Tract)",
x = "Income",
y = "House Price") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))
Distance to the Nearest Shopping Center
ggplot(house) +
geom_point(aes(x = shop_dis1, y = price), color = "black", pch = 16, size = 1.6) +
labs(title = "Seattle House Price vs. Distance to the Nearest Shopping Center",
x = "Distance",
y = "House Price") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))
Crime Count within 1/8 mile of Each House
ggplot(house) +
geom_point(aes(x = crime_c, y = price), color = "black", pch = 16, size = 1.6) +
labs(title = "Seattle House Price vs. Crime Count within 1/8 mile of Each House",
x = "Crime Count",
y = "House Price") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))
# quantile break and color palette
breaks_quantiles <- classIntervals(house$price, n = 5, style = "quantile")
colors <- brewer.pal(n = 5, name = "YlOrRd")
labels <- paste0(formatC(breaks_quantiles$brks[-length(breaks_quantiles$brks)], format = "f", digits = 0, big.mark = ","),
" - ",
formatC(breaks_quantiles$brks[-1], format = "f", digits = 0, big.mark = ","))
# plot house price
ggplot() +
geom_sf(data = seattle, fill = "#ECECEC", color = "#2166ac", linewidth = 0.3) +
geom_sf(data = house,
aes(color = cut(price, breaks = breaks_quantiles$brks, include.lowest = TRUE)), size = 0.3) +
scale_color_manual(values = colors,
labels = labels,
name = "House Price (Quantile)") +
labs(title = "House Price in Seattle, 2015") +
theme_void() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))
Lot Square Feet
# quantile break and color palette
breaks_quantiles <- classIntervals(house$sqft_lot, n = 5, style = "quantile")
colors <- brewer.pal(n = 5, name = "YlOrRd")
labels <- paste0(formatC(breaks_quantiles$brks[-length(breaks_quantiles$brks)], format = "f", digits = 0, big.mark = ","),
" - ",
formatC(breaks_quantiles$brks[-1], format = "f", digits = 0, big.mark = ","))
# plot lot square feet
ggplot() +
geom_sf(data = seattle, fill = "#ECECEC", color = "#2166ac", linewidth = 0.3) +
geom_sf(data = house, aes(color = cut(sqft_lot, breaks = breaks_quantiles$brks,
include.lowest = TRUE)), size = 0.3) +
scale_color_manual(values = colors,
labels = labels,
name = "Lot Square Feet (Quantile)") +
labs(title = "Lot Square Feet of Houses in Seattle, 2015") +
theme_void() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))
School District
# color palette
colors <- brewer.pal(n = 7, name = "Set3")
labels <- c("District One", "District Two", "District Three", "District Four",
"District Five", "District Six", "District Seven")
# plot school district
ggplot() +
geom_sf(data = sch, fill = "#ECECEC", color = "#2166ac", linewidth = 0.3) +
geom_sf(data = house, aes(color = sch_cat), size = 0.3) +
scale_color_manual(values = colors,
labels = labels,
name = "School District") +
labs(title = "School Districts in Seattle, 2015") +
theme_void() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))
White Population Share
# quantile break and color palette
breaks_quantiles <- classIntervals(acsTractsSeattle.2015$white_share, n = 5, style = "quantile")
colors <- brewer.pal(n = 5, name = "Blues")
labels <- paste0(formatC(breaks_quantiles$brks[-length(breaks_quantiles$brks)], format = "f", digits = 0, big.mark = ","),
" - ",
formatC(breaks_quantiles$brks[-1], format = "f", digits = 0, big.mark = ","))
# plot white population share
ggplot() +
geom_sf(data = st_intersection(acsTractsSeattle.2015, seattle),
aes(fill = cut(white_share, breaks = breaks_quantiles$brks, include.lowest = TRUE)),
color = "#ECECEC") +
scale_fill_manual(values = colors,
labels = labels,
name = "White Share (Quantile)") +
labs(title = "White Population Share of Census Tracts in Seattle, 2015") +
theme_void() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))
# finalize regression dataset
house <- house %>%
select(-GEOID, -NAME) %>%
st_drop_geometry()
# select continuous or categorical variable
lm1 <- lm(price ~ .-key, data = house)
lm1.sum <- summary(lm1)
## bedroom
lm2 <- lm(price ~ .-bed_cat-key, data = house)
lm2.2 <- lm(price ~ .-bedrooms-key, data = house)
stargazer(lm2,lm2.2, type="text")
house <- house %>%
select(-bed_cat)
## bathroom
lm3 <- lm(price ~ .-bathrooms-key, data = house)
lm3.2 <- lm(price ~ .-bath_dum-key, data = house)
stargazer(lm3,lm3.2, type="text")
house <- house %>%
select(-bathrooms)
## dependency
lm4 <- lm(price ~ .-total_dep-key, data = house)
lm4.2 <- lm(price ~ .-elder_dep-key, data = house)
stargazer(lm4,lm4.2, type="text")
house <- house %>%
select(-total_dep)
## bachelor degree
lm5 <- lm(price ~ .-bach_dum-key, data = house)
lm5.2 <- lm(price ~ .-bach_share-key, data = house)
stargazer(lm5,lm5.2, type="text")
house <- house %>%
select(-bach_dum)
## median household income
lm6 <- lm(price ~ .-median_hh_dum-key, data = house)
lm6.2 <- lm(price ~ .-median_hh_income-key, data = house)
stargazer(lm6,lm6.2, type="text")
house <- house %>%
select(-median_hh_dum)
## employment
lm7 <- lm(price ~ .-employ_dum-key, data = house)
lm7.2 <- lm(price ~ .-employ_rate-key, data = house)
stargazer(lm7,lm7.2, type="text")
house <- house %>%
select(-employ_dum)
## poverty
lm8 <- lm(price ~ .-pover_dum-key, data = house)
lm8.2 <- lm(price ~ .-pover_rate-key, data = house)
stargazer(lm8,lm8.2, type="text")
house <- house %>%
select(-pover_dum)
## subway station
lm9 <- lm(price ~ .-sub_cat-key, data = house)
lm9.2 <- lm(price ~ .-sub_dis-key, data = house)
stargazer(lm9,lm9.2, type="text")
house <- house %>%
select(-sub_cat)
## park
lm10 <- lm(price ~ .-parks_area-key, data = house)
lm10.2 <- lm(price ~ .-park_cat-key, data = house)
stargazer(lm10,lm10.2, type="text")
house <- house %>%
select(-parks_area)
## medical facility
lm11 <- lm(price ~ .-med_dis2-med_dis3-key, data = house)
lm11.2 <- lm(price ~ .-med_dis1-med_dis2-key, data = house)
lm11.3 <- lm(price ~ .-med_dis1-med_dis3-key, data = house)
stargazer(lm11,lm11.2,lm11.3, type="text")
house <- house %>%
select(-med_dis2, -med_dis3)
## shopping center
lm12 <- lm(price ~ .-shop_dis2-shop_dis3-shop_cat-key, data = house)
lm12.2 <- lm(price ~ .-shop_dis1-shop_dis2-shop_cat-key, data = house)
lm12.3 <- lm(price ~ .-shop_dis1-shop_dis3-shop_cat-key, data = house)
lm12.4 <- lm(price ~ .-shop_dis1-shop_dis2-shop_dis3-key, data = house)
stargazer(lm12,lm12.2,lm12.3,lm12.4, type="text")
house <- house %>%
select(-shop_dis2, -shop_dis3,-shop_cat)
## crime
lm13 <- lm(price ~ .-crime_dis1-crime_dis2-crime_dis3-crime_dis4-crime_dis5-key, data = house)
lm13.2 <- lm(price ~ .-crime_dis2-crime_dis3-crime_dis4-crime_dis5-crime_c-key, data = house)
lm13.3 <- lm(price ~ .-crime_dis1-crime_dis3-crime_dis4-crime_dis5-crime_c-key, data = house)
lm13.4 <- lm(price ~ .-crime_dis2-crime_dis1-crime_dis4-crime_dis5-crime_c-key, data = house)
lm13.5 <- lm(price ~ .-crime_dis2-crime_dis3-crime_dis1-crime_dis5-crime_c-key, data = house)
lm13.6 <- lm(price ~ .-crime_dis2-crime_dis3-crime_dis4-crime_dis1-crime_c-key, data = house)
stargazer(lm13,lm13.2,lm13.3,lm13.4,lm13.5,lm13.6, type="text")
house <- house %>%
select(-crime_dis1,-crime_dis2, -crime_dis3, -crime_dis4, -crime_dis5)
## add fixed effect
house_lm <- house %>%
left_join(hh%>%select(key), by = "key") %>%
st_as_sf() %>%
st_join(., neigh_large) %>%
st_join(., neigh_small) %>%
st_join(., neigh_tract) %>%
st_drop_geometry() %>%
select(-key)
str(house_lm)
# split data 0.7/0.3
set.seed(1)
inTrain <- createDataPartition(
y = paste(house_lm$reno_dum, house_lm$bath_dum, house_lm$floor_cat,
house_lm$water_dum, house_lm$view_cat, house_lm$condition_cat,
house_lm$grade_dum, house_lm$sch_cat, house_lm$park_cat, house_lm$L_NAME),
p = .70, list = FALSE) # create a vector for the training set (70%)
# subset the dataset to create the training set
seattle.train.lm <- house_lm[inTrain,] # training set
# subset the dataset to create the testing set
seattle.test.lm <- house_lm[-inTrain,] # testing set
rbind(seattle.train.lm%>%mutate(dataset = "training"),
seattle.test.lm %>% mutate(dataset = "testing"))%>%
group_by(dataset)%>%
summarise(count = n())%>%
mutate(percent = round(count/sum(count) * 100, digits = 2))%>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = T) %>%
column_spec(1:3, extra_css = "text-align: left;")
| dataset | count | percent |
|---|---|---|
| testing | 1663 | 24.7 |
| training | 5071 | 75.3 |
# ignore neighborhood effect first
seattle.train <- seattle.train.lm %>%
select(-L_NAME, -S_NAME, -T_NAME)
seattle.test <- seattle.test.lm %>%
select(-L_NAME, -S_NAME, -T_NAME)
# build regression model on training dataset
lm14 <- lm(price ~ ., data = seattle.train)
summary(lm14)
vif(lm14) #sub_dis = 2.664951
## cor on continuous variable
cor.multi <- seattle.train %>%
select_if(is.numeric)%>%
corrr::correlate() %>%
autoplot() +
geom_text(aes(label = round(r,digits=2)),size = 1)
## result:
## sub_dis have low correlation with other numeric variables
## high correlation: white share & bachelor degree
## high correlation: white share & poverty rate
## lm on sub_dis
lm.sub <- lm(sub_dis ~ ., data = seattle.train)
summary(lm.sub) # R^2 = 0.8597 -> highly correlated with other category variables
vif(lm.sub)
lm15 <- lm(price ~ .-sub_dis, data = seattle.train)
stargazer(lm14,lm15, type = "text") #R^2: 0.805->0.803
vif(lm15) #white_share = 2.452959
seattle.train <- seattle.train %>%
select(-sub_dis)
## high correlation: white share & bachelor degree
lm16 <- lm(price ~ .-bach_share, data = seattle.train)
lm16.2 <- lm(price ~ .-white_share, data = seattle.train)
stargazer(lm16, lm16.2, type="text")
vif(lm16)
seattle.train <- seattle.train %>%
select(-bach_share)
## high correlation: white share & poverty rate
lm17 <- lm(price ~ .-pover_rate, data = seattle.train)
lm17.2 <- lm(price ~ .-white_share, data = seattle.train)
stargazer(lm17, lm17.2, type="text")
vif(lm17)
seattle.train <- seattle.train %>%
select(-pover_rate)
## insignificance: tree canopy
lm18 <- lm(price ~ .-tree_canopy, data = seattle.train)
summary(lm18)
vif(lm18)
seattle.train <- seattle.train %>%
select(-tree_canopy)
# make predictions on the test set and evaluate model performance
lm19 <- lm(price ~ ., data = seattle.train)
summary(lm19)
# adjusted R-squared = 0.8013
## first regression model from last step
seattle.test1 <-
seattle.test %>%
mutate(regression = "baseline regression", # add a column indicating the type of regression model used
price.predict = predict(lm19, seattle.test), # predict house prices using the trained regression model
# calculate the difference between predicted and actual house prices
price.error = price.predict - price,
# calculate the absolute difference between predicted and actual house prices
price.AbsError = abs(price.predict - price),
# calculate the absolute percentage error
price.APE = round((abs(price.predict - price)) / price * 100, digits = 2))
mean(seattle.test1$price.AbsError) # 93754.08
mean(seattle.test1$price.APE) # 18.97%
## "year_used" p = 0.051837 -> remove
lm20 <- lm(price ~ .-year_used, data = seattle.train)
seattle.test2 <-
seattle.test %>%
mutate(regression = "baseline regression", # add a column indicating the type of regression model used
price.predict = predict(lm20, seattle.test), # predict house prices using the trained regression model
# calculate the difference between predicted and actual house prices
price.error = price.predict - price,
# calculate the absolute difference between predicted and actual house prices
price.AbsError = abs(price.predict - price),
# calculate the absolute percentage error
price.APE = round((abs(price.predict - price)) / price * 100, digits = 2))
summary(lm20) # adjusted R-squared = 0.8012 (-0.0001)
mean(seattle.test2$price.AbsError) # 93637.18 (-116.9)
mean(seattle.test2$price.APE) # 18.93% (-0.04%)
#-> accuracy worse but generalizability better
seattle.train <- seattle.train %>%
select(-year_used)
## "crime_c" p = 0.002917 -> remove
lm21 <- lm(price ~ .-crime_c, data = seattle.train)
seattle.test3 <-
seattle.test %>%
mutate(regression = "baseline regression", # add a column indicating the type of regression model used
price.predict = predict(lm21, seattle.test), # predict house prices using the trained regression model
# calculate the difference between predicted and actual house prices
price.error = price.predict - price,
# calculate the absolute difference between predicted and actual house prices
price.AbsError = abs(price.predict - price),
# calculate the absolute percentage error
price.APE = round((abs(price.predict - price)) / price * 100, digits = 2))
summary(lm21) # adjusted R-squared = 0.8009 (-0.003)
mean(seattle.test3$price.AbsError) # 93507.13 (-130.05)
mean(seattle.test3$price.APE) # 18.89% (-0.04)
#-> accuracy worse but generalizability better
seattle.train <- seattle.train %>%
select(-crime_c)
## "employ_rate" -> remove
lm22 <- lm(price ~ .-employ_rate, data = seattle.train)
seattle.test4 <-
seattle.test %>%
mutate(regression = "baseline regression", # add a column indicating the type of regression model used
price.predict = predict(lm22, seattle.test), # predict house prices using the trained regression model
# calculate the difference between predicted and actual house prices
price.error = price.predict - price,
# calculate the absolute difference between predicted and actual house prices
price.AbsError = abs(price.predict - price),
# calculate the absolute percentage error
price.APE = round((abs(price.predict - price)) / price * 100, digits = 2))
summary(lm22) # adjusted R-squared = 0.8004 (-0.0005)
mean(seattle.test4$price.AbsError) # 93339.94 (-167.19)
mean(seattle.test4$price.APE) # 18.85% (-0.04%)
#-> accuracy worse but generalizability better
seattle.train <- seattle.train %>%
select(-employ_rate)
## "med_dis1" -> remove
lm23 <- lm(price ~ .-med_dis1, data = seattle.train)
seattle.test5 <-
seattle.test %>%
mutate(regression = "baseline regression", # add a column indicating the type of regression model used
price.predict = predict(lm23, seattle.test), # predict house prices using the trained regression model
# calculate the difference between predicted and actual house prices
price.error = price.predict - price,
# calculate the absolute difference between predicted and actual house prices
price.AbsError = abs(price.predict - price),
# calculate the absolute percentage error
price.APE = round((abs(price.predict - price)) / price * 100, digits = 2))
summary(lm23) # adjusted R-squared = 0.7995 (-0.0009)
mean(seattle.test5$price.AbsError) # 93014.7 (-325.24)
mean(seattle.test5$price.APE) # 18.72% (-0.13%)
#-> accuracy worse but generalizability better
seattle.train <- seattle.train %>%
select(-med_dis1)
## "elder_dep" -> remove
lm24 <- lm(price ~ .-elder_dep, data = seattle.train)
seattle.test6 <-
seattle.test %>%
mutate(regression = "baseline regression", # add a column indicating the type of regression model used
price.predict = predict(lm24, seattle.test), # predict house prices using the trained regression model
# calculate the difference between predicted and actual house prices
price.error = price.predict - price,
# calculate the absolute difference between predicted and actual house prices
price.AbsError = abs(price.predict - price),
# calculate the absolute percentage error
price.APE = round((abs(price.predict - price)) / price * 100, digits = 2))
summary(lm24) # adjusted R-squared = 0.7962 (-0.0033)
mean(seattle.test6$price.AbsError) # 92474.93 (-539.77)
mean(seattle.test6$price.APE) # 18.50% (-0.22%)
#-> accuracy worse but generalizability better
seattle.train <- seattle.train %>%
select(-elder_dep)
## "pop_den" -> remove
lm25 <- lm(price ~ .-pop_den, data = seattle.train)
seattle.test7 <-
seattle.test %>%
mutate(regression = "baseline regression", # add a column indicating the type of regression model used
price.predict = predict(lm25, seattle.test), # predict house prices using the trained regression model
# calculate the difference between predicted and actual house prices
price.error = price.predict - price,
# calculate the absolute difference between predicted and actual house prices
price.AbsError = abs(price.predict - price),
# calculate the absolute percentage error
price.APE = round((abs(price.predict - price)) / price * 100, digits = 2))
summary(lm25) # adjusted R-squared = 0.7963 (-0.0001)
mean(seattle.test7$price.AbsError) # 92440.65 (-34.28)
mean(seattle.test7$price.APE) # 18.49% (-0.01)
#-> accuracy worse but generalizability better
seattle.train <- seattle.train %>%
select(-pop_den)
## "reno_dum","bedrooms","bath_dum", "sqft_living", "sqft_lot", "floor_cat", "water_dum", "view_cat", "condition_cat", "grade_dum", "white_share", "median_hh_income", "sch_cat", "park_cat", "shop_dis1" -> keep
#-> both worse
# spatial lag of price
house.sf <- hh %>%
select(geometry,key)%>%
right_join(house, by = "key")
coords <- st_coordinates(house.sf)
neighborList <- knn2nb(knearneigh(coords, 5))
spatialWeights <- nb2listw(neighborList, style="W")
house.sf %>%
mutate(lagPrice = lag.listw(spatialWeights, price))%>%
ggplot()+
geom_point(aes(x = lagPrice, y = price), color = "black", pch = 16, size = 1.6)+
stat_smooth(aes(lagPrice, price),
method = "lm", se = FALSE, size = 1, color="#b2182b")+
labs(title="Price as a Function of the Spatial Lag of Price") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))
# spatial lag of price error
house.test.sf <- hh %>%
select(geometry,key)%>%
right_join(house[-inTrain,], by = "key")
coords.test <- sf::st_coordinates(house.test.sf)
neighborList.test <- knn2nb(knearneigh(coords.test, 5))
spatialWeights.test <- nb2listw(neighborList.test, style="W")
seattle.test7 %>%
mutate(lagPriceError = lag.listw(spatialWeights.test, price.error)) %>%
ggplot()+
geom_point(aes(x = lagPriceError, y = price.error), color = "black", pch = 16, size = 1.6)+
stat_smooth(aes(lagPriceError, price.error),
method = "lm", se = FALSE, size = 1, color="#b2182b")+
labs(title="Error as a Function of the Spatial Lag of Error")+
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))
# moran's I test
moranTest <- moran.mc(seattle.test7$price.error,
spatialWeights.test, nsim = 999)
#p-value = 0.001, pattern is slight 0.09
ggplot(as.data.frame(moranTest$res[c(1:999)]), aes(moranTest$res[c(1:999)])) +
geom_histogram(binwidth = 0.01) +
geom_vline(aes(xintercept = moranTest$statistic), colour = "#FA7800",size=1) +
scale_x_continuous(limits = c(-1, 1)) +
labs(title="Observed and permuted Moran's I",
subtitle= "Observed Moran's I in orange",
x="Moran's I",
y="Count") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"),
plot.subtitle = element_text(hjust = 0.5))
moranTest.stats <- moranTest$statistic # 0.2198671
# without fixed effect
set.seed(1)
fitControl <- trainControl(method = "cv", number = 100)
## "reno_dum","bedrooms","bath_dum", "sqft_living", "sqft_lot", "floor_cat", "water_dum", "view_cat", "condition_cat", "grade_dum", "white_share", "median_hh_income", "sch_cat", "park_cat", "shop_dis1"
seattle.cv <-
train(price ~ ., data = house_lm %>% select(price, reno_dum, bedrooms, bath_dum,
sqft_living, sqft_lot, floor_cat,
water_dum, view_cat, condition_cat,
grade_dum, white_share, median_hh_income, sch_cat, park_cat, shop_dis1),
method = "lm", trControl = fitControl, na.action = na.pass)
### Large: L_NAME
seattle.neighL.cv <-
train(price ~ ., data = house_lm %>% select(price, reno_dum, bedrooms, bath_dum,
sqft_living, sqft_lot, floor_cat,
water_dum, view_cat, condition_cat,
grade_dum, white_share, median_hh_income, sch_cat, park_cat, shop_dis1, L_NAME),
method = "lm", trControl = fitControl, na.action = na.pass)
### Small: S_NAME
seattle.neighS.cv <-
train(price ~ ., data = house_lm %>% select(price, reno_dum, bedrooms, bath_dum,
sqft_living, sqft_lot, floor_cat,
water_dum, view_cat, condition_cat,
grade_dum, white_share, median_hh_income, sch_cat, park_cat, shop_dis1, S_NAME),
method = "lm", trControl = fitControl, na.action = na.pass)
### tract: T_NAME
seattle.neighT.cv <-
train(price ~ ., data = house_lm %>% select(price, reno_dum, bedrooms, bath_dum,
sqft_living, sqft_lot, floor_cat,
water_dum, view_cat, condition_cat,
grade_dum, white_share, median_hh_income, sch_cat, park_cat, shop_dis1, T_NAME),
method = "lm", trControl = fitControl, na.action = na.pass)
#-> census tract as the fixed effect improves the model most
data.frame(rbind(seattle.cv$results,seattle.neighL.cv$results, seattle.neighS.cv$results,seattle.neighT.cv$results))%>%
mutate(intercept = c("baseline","fix_effect_large_district", "fix_effect_small_neighborhood", "fix_effect_census_tract"))%>%
rename(model = "intercept")%>%
select(-RMSESD, -RsquaredSD, -MAESD) %>%
kable()%>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = T) %>%
column_spec(1:4, extra_css = "text-align: left;")
| model | RMSE | Rsquared | MAE |
|---|---|---|---|
| baseline | 156295.9 | 0.7848814 | 105016.78 |
| fix_effect_large_district | 153985.8 | 0.7890730 | 102215.99 |
| fix_effect_small_neighborhood | 143735.5 | 0.8130965 | 95447.55 |
| fix_effect_census_tract | 142551.1 | 0.8194192 | 94686.55 |
Dependent variable:
House Price
Independent variables:
1. reno_dum (renovation status)
2. bedrooms (number of bedrooms)
3. bath_dum (category of bathroom count)
4. sqft_living (living area square feet)
5. sqft_lot (lot square feet)
6. floor_cat (category by floors)
7. water_dum (waterfront factor)
8. view_cat (view quality)
9. condition_cat (condition level)
10. grade_dum (grade level)
11. white_share (white population share)
12. median_hh_income (median household income)
13. sch_cat (school districts)
14. park_cat (number of nearby parks)
15. shop_dis1 (distance to the nearest shopping center)
16. T_NAME (census tracts)
seattle.train.lm <- seattle.train.lm %>%
mutate(T_NAME = as.factor(T_NAME))
par(mfrow = c(2, 2))
# with fixed effect
lm.final.nhood <- lm(price ~ reno_dum + bedrooms + bath_dum + sqft_living + sqft_lot +
floor_cat + water_dum + view_cat + condition_cat + grade_dum +
white_share + median_hh_income + sch_cat + park_cat + shop_dis1 +
T_NAME, data = seattle.train.lm)
plot(lm.final.nhood)
mtext("Diagnostic Plots for Linear Model with Fixed Effect", side = 3, line = -2, outer = TRUE, cex = 1.1, font = 2)
balabalbala
Amenities impact: https://www.rentseattle.com/blog/how-local-amenities-help-seattle-investors-find-good-properties
reference: “houses within a quarter mile to a half mile of a Metro station sold for 7.5% more, and houses within a half mile to a mile sold for 3.9% more.” https://www.freddiemac.com/research/insight/20191002-metro-station-impact#:~:text=Similarly%2C%20houses%20within%20a%20quarter,mile%20sold%20for%203.9%25%20more.
variable: the name of the school district
reference: “the price differentials for similar homes — same square footage, number of bedrooms and baths — that are located near each other but served by different school districts can range from tens of thousands to hundreds of thousands of dollars.” https://www.seattletimes.com/business/how-much-do-good-schools-boost-home-prices/
reference: “Most community-sized parks (~40 acres) had a substantial impact on home prices up to a distance of 500 to 600 feet. While the influence of larger parks extended out to 2,000 feet, beyond 500 feet the influence was relatively small.” https://www.naturequant.com/blog/Impact-of-Parks-on-Property-Value/
reference: “a city facing major development pressure — trees were associated with an increase in single family home values” https://www.vibrantcitieslab.com/resources/urban-trees-increase-home-values/
source: https://data-seattlecitygis.opendata.arcgis.com/datasets/SeattleCityGIS::seattle-tree-canopy-2016-2021-50-acre-hexagons/explore?layer=1&location=47.580733%2C-122.309741%2C11.26
reference: “hospitals would only be highly evaluated in a ‘close-but-not-too-close’ geographic location” https://www.researchgate.net/publication/282942128_The_non-linearity_of_hospitals'_proximity_on_property_prices_experiences_from_Taipei_Taiwan
source: https://gis-kingcounty.opendata.arcgis.com/datasets/1b7f0fb5179a400f91a35c0b6bfd77c9_733/explore
reference:“only robbery and aggravated assault crimes (per acre) exert a meaningful influence upon neighborhood housing values.” https://sciencedirect.com/science/article/pii/S0166046210000086#aep-abstract-id3 “The overall effect on house prices of crime (measured as crime rates) is relatively small, but if its impact is measured by distance to a crime hot spot, the effect is non-negligible.” https://www.researchgate.net/publication/335773241_Do_crime_hot_spots_affect_housing_prices_Do_crime_hot_spots_affect_housing_prices
source:https://data.seattle.gov/Public-Safety/SPD-Crime-Data-2008-Present/tazs-3rd5/about_data